In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import folktables
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from folktables import ACSDataSource, ACSEmployment, generate_categories, ACSIncome, ACSHealthInsurance
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import Pipeline, FeatureUnion
import math
import sklearn
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import scipy as sp
from scipy import stats
import rfpimp
import dalex as dx
import warnings
warnings.filterwarnings("ignore")
import xgboost as xg
In [2]:
ACSIncome_categories = {
    "COW": {
        1.0: (
            "Employee of a private for-profit company or"
            "business, or of an individual, for wages,"
            "salary, or commissions"
        ),
        2.0: (
            "Employee of a private not-for-profit, tax-exempt,"
            "or charitable organization"
        ), 
        3.0: "Local government employee (city, county, etc.)",
        4.0: "State government employee",
        5.0: "Federal government employee", 
        6.0: (
            "Self-employed in own not incorporated business,"
            "professional practice, or farm"
        ),
        7.0: (
            "Self-employed in own incorporated business,"
            "professional practice or farm"
        ),
        8.0: "Working without pay in family business or farm",
        9.0: "Unemployed and last worked 5 years ago or earlier or never worked",
    },
    "SCHL": {
        1.0: "No schooling completed",
        2.0: "Nursery school, preschool",
        3.0: "Kindergarten",
        4.0: "Grade 1",
        5.0: "Grade 2",
        6.0: "Grade 3",
        7.0: "Grade 4",
        8.0: "Grade 5",
        9.0: "Grade 6",
        10.0: "Grade 7",
        11.0: "Grade 8",
        12.0: "Grade 9",
        13.0: "Grade 10",
        14.0: "Grade 11",
        15.0: "12th grade - no diploma",
        16.0: "Regular high school diploma",
        17.0: "GED or alternative credential",
        18.0: "Some college, but less than 1 year",
        19.0: "1 or more years of college credit, no degree",
        20.0: "Associate's degree",
        21.0: "Bachelor's degree",
        22.0: "Master's degree",
        23.0: "Professional degree beyond a bachelor's degree",
        24.0: "Doctorate degree",
    },
    "MAR": {
        1.0: "Married",
        2.0: "Widowed",
        3.0: "Divorced",
        4.0: "Separated",
        5.0: "Never married or under 15 years old",
    },
    "SEX": {1.0: "Male", 2.0: "Female"},
    "RAC1P": {
        1.0: "White alone",
        2.0: "Black or African American alone",
        3.0: "American Indian alone",
        4.0: "Alaska Native alone",
        5.0: (
            "American Indian and Alaska Native tribes specified;"
            "or American Indian or Alaska Native,"
            "not specified and no other"
        ),
        6.0: "Asian alone",
        7.0: "Native Hawaiian and Other Pacific Islander alone",
        8.0: "Some Other Race alone",
        9.0: "Two or More Races",
    },
}

Plotting¶

In [3]:
#edited to include WAGP and disclude RELP and POBP
ACSIncome = folktables.BasicProblem(
    features=[
        'AGEP',
        'COW',
        'SCHL',
        'MAR',
        'OCCP',
        'WKHP',
        'SEX',
        'RAC1P',
        'WAGP',
        'HISP'
    ],
    target='PINCP',
    target_transform=lambda x: x > 50000,    
    group='RAC1P',
    preprocess=folktables.adult_filter,
    postprocess=lambda x: np.nan_to_num(x, -1),
)
In [4]:
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
ca_data = data_source.get_data(states=["CA"], download=True)

ca_features1, ca_labels1, _ = ACSIncome.df_to_pandas(ca_data, categories=ACSIncome_categories, dummies=False)
In [5]:
ca_features1
Out[5]:
AGEP COW SCHL MAR OCCP WKHP SEX RAC1P WAGP HISP
0 30 Self-employed in own not incorporated business... Grade 11 Married 9610.0 40.0 Male Some Other Race alone 500.0 2
1 21 State government employee Regular high school diploma Never married or under 15 years old 1970.0 20.0 Male White alone 7700.0 1
2 65 Employee of a private not-for-profit, tax-exem... Master's degree Never married or under 15 years old 2040.0 8.0 Male White alone 5000.0 1
3 33 Employee of a private for-profit company orbus... Grade 11 Divorced 9610.0 40.0 Male White alone 12000.0 1
4 18 Employee of a private not-for-profit, tax-exem... 1 or more years of college credit, no degree Never married or under 15 years old 1021.0 18.0 Female White alone 300.0 7
... ... ... ... ... ... ... ... ... ... ...
195660 38 Employee of a private for-profit company orbus... Master's degree Married 1021.0 40.0 Male Asian alone 565000.0 1
195661 39 Employee of a private for-profit company orbus... Master's degree Married 1021.0 40.0 Female Asian alone 209000.0 1
195662 61 Employee of a private for-profit company orbus... 1 or more years of college credit, no degree Married 5240.0 45.0 Male White alone 105000.0 1
195663 69 Self-employed in own incorporated business,pro... Doctorate degree Married 2040.0 45.0 Male Asian alone 30000.0 1
195664 40 Employee of a private for-profit company orbus... GED or alternative credential Married 9600.0 40.0 Male Some Other Race alone 30000.0 2

195665 rows × 10 columns

In [6]:
asian=ca_features1[ca_features1['RAC1P']=='Asian alone']
black=ca_features1[ca_features1['RAC1P']=='Black or African American alone']
white=ca_features1[ca_features1['RAC1P']=='White alone']
hisp=ca_features1[ca_features1['RAC1P']=='Some Other Race alone']
plottable=pd.concat([asian, black, white, hisp])
plottable['AGEP'] = plottable['AGEP'].astype(int) #changing strings with numbers to integers
plottable['OCCP'] = plottable['OCCP'].astype(int)
plottable['WKHP'] = plottable['WKHP'].astype(int)
plottable['WAGP'] = plottable['WAGP'].astype(int)
plottable['HISP'] = plottable['HISP'].astype(int)
plottable["MAR"].replace({"Never married or under 15 years old": "Never married"}, inplace=True)
plottable["RAC1P"].replace({"Asian alone": "Asian"}, inplace=True)
plottable["RAC1P"].replace({"Black or African American alone": "Black"}, inplace=True)
plottable["RAC1P"].replace({"White alone": "White"}, inplace=True)
plottable["RAC1P"].replace({"Some Other Race alone": "Other"}, inplace=True)
plottable=plottable.rename(columns={"AGEP": "AGE", "RAC1P": "RACE", "WAGP": "SALARY"})
plottable=plottable[plottable['SALARY']<350000]
for i in range(3,25):
    plottable['HISP']=plottable['HISP'].replace(i,2,regex=True)
plottable["SCHL"].replace({"Bachelor\'s degree": "Bachelors degree",
                           "Regular high school diploma":"High School Diploma or Equivalent",
                           "GED or alternative credential":"High School Diploma or Equivalent",
                           "1 or more years of college credit, no degree":"Some College",
                           "Master's degree":"Graduate degree",
                           "Associate's degree":"Some College",
                           "Some college, but less than 1 year":"Some College",
                           "Professional degree beyond a bachelor\'s degree":"Graduate degree",
                           "12th grade - no diploma":"Some High School or below",
                           "Doctorate degree":"Graduate degree",
                           "No schooling completed" :"Some High School or below",
                           "Grade 6":"Some High School or below",
                           "Grade 11":"Some High School or below",
                           "Grade 9":"Some High School or below",
                           "Grade 10":"Some High School or below", 
                           "Grade 8":"Some High School or below",
                           "Grade 3":"Some High School or below",
                           "Grade 5":"Some High School or below",  
                           "Grade 7":"Some High School or below",
                           "Grade 4":"Some High School or below",
                           "Grade 2":"Some High School or below",
                           "Grade 1":"Some High School or below",
                           "Nursery school, preschool":"Some High School or below",
                           "Kindergarten":"Some High School or below" 
                          }, inplace=True)
plottable['COW'].replace({"Employee of a private for-profit company orbusiness, or of an individual, for wages,salary, or commissions":"Private Employee",    
                          "Self-employed in own not incorporated business,professional practice, or farm":"Self_Employed",
                          "Local government employee (city, county, etc.)":"Government Employee",
                          "Employee of a private not-for-profit, tax-exempt,or charitable organization":"Private Employee",
                          "State government employee":"Government Employee",
                          "Self-employed in own incorporated business,professional practice or farm":"Self_Employed",
                          "Federal government employee":"Government Employee",
                          "Working without pay in family business or farm":"Self_Employed"    
                          }, inplace=True)
plottable
Out[6]:
AGE COW SCHL MAR OCCP WKHP SEX RACE SALARY HISP
27 23 Private Employee Bachelors degree Never married 2545 20 Female Asian 4000 1
33 18 Private Employee High School Diploma or Equivalent Never married 9610 8 Female Asian 1500 1
49 18 Private Employee Some College Never married 725 12 Female Asian 1400 1
53 25 Government Employee Bachelors degree Never married 3870 40 Male Asian 13000 1
80 20 Private Employee High School Diploma or Equivalent Never married 725 18 Female Asian 650 1
... ... ... ... ... ... ... ... ... ... ...
195602 24 Private Employee Bachelors degree Never married 2300 40 Female Other 20000 2
195628 46 Private Employee High School Diploma or Equivalent Married 8740 40 Female Other 12000 2
195629 50 Private Employee Some High School or below Married 7340 50 Male Other 17900 2
195642 42 Private Employee Some High School or below Married 6260 40 Male Other 61000 2
195664 40 Private Employee High School Diploma or Equivalent Married 9600 40 Male Other 30000 2

182867 rows × 10 columns

In [7]:
plottable.describe().T
Out[7]:
count mean std min 25% 50% 75% max
AGE 182867.0 42.909169 14.899778 17.0 30.0 42.0 55.0 94.0
OCCP 182867.0 4036.304730 2637.664438 10.0 2016.0 4110.0 5550.0 9830.0
WKHP 182867.0 37.784111 12.926377 1.0 32.0 40.0 40.0 99.0
SALARY 182867.0 50260.777067 52382.233154 0.0 12500.0 35000.0 70000.0 329000.0
HISP 182867.0 1.331192 0.470643 1.0 1.0 1.0 2.0 2.0
In [8]:
freq_age=plottable['AGE'].value_counts().sort_index()
freq_age.to_frame()
## x-axis for the plot
x_data = freq_age.index.values
  
## y-axis as the gaussian
y_data = stats.norm.pdf(freq_age)
  
## plot data
plt.plot(freq_age)
plt.title('Distribution of Age in the Dataset')
plt.xlabel('Age')
Out[8]:
Text(0.5, 0, 'Age')
In [9]:
plt.grid()
plt.hist(plottable['AGE'], bins=78, alpha=0.8)
plt.title('Distribution of Age in the Dataset')
plt.xlabel('Age')
Out[9]:
Text(0.5, 0, 'Age')
In [10]:
plt.grid()
plt.hist(plottable['WKHP'], bins=20, alpha=0.8)
plt.title('Distribution of Hours Worked per Week in the Dataset')
plt.xlabel('Hours Worked per Week')
Out[10]:
Text(0.5, 0, 'Hours Worked per Week')
In [11]:
plt.grid()
plt.hist(plottable['SALARY'], bins=20, alpha=0.8)
plt.title('Distribution of Salary in the Dataset')
plt.xlabel('Salary')
Out[11]:
Text(0.5, 0, 'Salary')
In [12]:
plottable.boxplot(by ='RACE', column =['SALARY'], grid = True)
Out[12]:
<AxesSubplot:title={'center':'SALARY'}, xlabel='RACE'>
In [13]:
by_race_gender_mean = plottable.groupby(['RACE','SEX'],as_index=False).SALARY.mean()
by_race_gender_median = plottable.groupby(['RACE','SEX'],as_index=False).SALARY.median()
by_race_gender_mean=by_race_gender_mean.rename(columns={'SALARY':"Mean_Salary"})
by_race_gender_median=by_race_gender_median.rename(columns={'SALARY':"Median_Salary"})
print(by_race_gender_mean,"\n")
print(by_race_gender_median)
    RACE     SEX   Mean_Salary
0  Asian  Female  52864.154185
1  Asian    Male  67356.512708
2  Black  Female  40330.507380
3  Black    Male  44765.347056
4  Other  Female  27031.443645
5  Other    Male  35275.147723
6  White  Female  43438.414165
7  White    Male  59038.678868 

    RACE     SEX  Median_Salary
0  Asian  Female        39000.0
1  Asian    Male        50000.0
2  Black  Female        30000.0
3  Black    Male        31100.0
4  Other  Female        21400.0
5  Other    Male        30000.0
6  White  Female        30000.0
7  White    Male        40900.0
In [14]:
fig, ax = plt.subplots(figsize=(12, 8))
x = np.arange(len(by_race_gender_mean.RACE.unique()))

# Define bar width. We'll use this to offset the second bar.

# Note we add the `width` parameter now which sets the width of each bar.
b1 = ax.bar(x, by_race_gender_mean.loc[by_race_gender_mean['SEX'] == 'Male', 'Mean_Salary'], width=0.4, label='Male')
# Same thing, but offset the x by the width of the bar.
b2 = ax.bar(x + 0.4, by_race_gender_mean.loc[by_race_gender_mean['SEX'] == 'Female', 'Mean_Salary'], width=0.4, label='Female')
# Fix the x-axes.
ax.set_xticks(x + 0.2)
ax.set_xticklabels(by_race_gender_mean.RACE.unique())

ax.set_ylabel("Salary in $")
ax.bar_label(b1, padding=3)
ax.bar_label(b2, padding=3)
ax.set_title("Mean Salary by Sex and Race")
ax.legend()
Out[14]:
<matplotlib.legend.Legend at 0x293fc20bb20>
In [15]:
fig, axx = plt.subplots(figsize=(12, 8))
x = np.arange(len(by_race_gender_median.RACE.unique()))

# Define bar width. We'll use this to offset the second bar.
bar_width = 0.4

# Note we add the `width` parameter now which sets the width of each bar.
b1 = axx.bar(x, by_race_gender_median.loc[by_race_gender_median['SEX'] == 'Male', 'Median_Salary'],
            width=bar_width, label='Male')
# Same thing, but offset the x by the width of the bar.
b2 = axx.bar(x + bar_width, by_race_gender_median.loc[by_race_gender_median['SEX'] == 'Female', 'Median_Salary'],
            width=bar_width, label="Female")
# Fix the x-axes.
axx.set_xticks(x + bar_width / 2)
axx.set_xticklabels(by_race_gender_median.RACE.unique())
axx.bar_label(b1, padding=3)
axx.bar_label(b2, padding=3)
axx.set_ylabel("Salary in $")

axx.set_title("Median Salary by Sex and Race")
axx.legend()
Out[15]:
<matplotlib.legend.Legend at 0x293fc286590>
In [16]:
import seaborn as sns
sns.countplot(x="RACE", hue="HISP", data=plottable);
plt.title('Hispanic population by race');
plt.legend(bbox_to_anchor=(.68, 1.0), loc='upper left', labels = ['Non-Hispanic','Hispanic'])
Out[16]:
<matplotlib.legend.Legend at 0x293fc229a20>
In [17]:
salary_fit = pd.cut(plottable.SALARY, bins = [0,plottable['SALARY'].median(),350001], labels= [True, False])
plottable.insert(10, 'Salary_below_median', salary_fit)
plottable
Out[17]:
AGE COW SCHL MAR OCCP WKHP SEX RACE SALARY HISP Salary_below_median
27 23 Private Employee Bachelors degree Never married 2545 20 Female Asian 4000 1 True
33 18 Private Employee High School Diploma or Equivalent Never married 9610 8 Female Asian 1500 1 True
49 18 Private Employee Some College Never married 725 12 Female Asian 1400 1 True
53 25 Government Employee Bachelors degree Never married 3870 40 Male Asian 13000 1 True
80 20 Private Employee High School Diploma or Equivalent Never married 725 18 Female Asian 650 1 True
... ... ... ... ... ... ... ... ... ... ... ...
195602 24 Private Employee Bachelors degree Never married 2300 40 Female Other 20000 2 True
195628 46 Private Employee High School Diploma or Equivalent Married 8740 40 Female Other 12000 2 True
195629 50 Private Employee Some High School or below Married 7340 50 Male Other 17900 2 True
195642 42 Private Employee Some High School or below Married 6260 40 Male Other 61000 2 False
195664 40 Private Employee High School Diploma or Equivalent Married 9600 40 Male Other 30000 2 True

182867 rows × 11 columns

In [18]:
ACSIncome_categories_1 = {
    "COW": {"Private Employee":1,
            "Government Employee":2,
            "Self_Employed":3}, 
    "SCHL": {"Some High School or below":1,
             "High School Diploma or Equivalent":2,
             "Some College":3,
             "Bachelors degree":4,
             "Graduate degree":5},
    "MAR": { "Married":1,
             "Widowed":2,
             "Divorced":3,
             "Separated":4,
             "Never married":5},
    "SEX": { "Male":1, 
             "Female":2},
    "RACE": {"White":1,
             "Black":2,
             "Asian":3,
             "Other":4,
    },
}
In [19]:
obj_df = plottable.replace(ACSIncome_categories_1)
obj_df
Out[19]:
AGE COW SCHL MAR OCCP WKHP SEX RACE SALARY HISP Salary_below_median
27 23 1 4 5 2545 20 2 3 4000 1 True
33 18 1 2 5 9610 8 2 3 1500 1 True
49 18 1 3 5 725 12 2 3 1400 1 True
53 25 2 4 5 3870 40 1 3 13000 1 True
80 20 1 2 5 725 18 2 3 650 1 True
... ... ... ... ... ... ... ... ... ... ... ...
195602 24 1 4 5 2300 40 2 4 20000 2 True
195628 46 1 2 1 8740 40 2 4 12000 2 True
195629 50 1 1 1 7340 50 1 4 17900 2 True
195642 42 1 1 1 6260 40 1 4 61000 2 False
195664 40 1 2 1 9600 40 1 4 30000 2 True

182867 rows × 11 columns

In [20]:
obj_df = obj_df[obj_df['Salary_below_median'].notna()]
#obj_df['Salary_below_median']=obj_df['Salary_below_median'].dropna()
obj_df['Salary_below_median'].isna().sum()
Out[20]:
0
In [21]:
obj_df
Out[21]:
AGE COW SCHL MAR OCCP WKHP SEX RACE SALARY HISP Salary_below_median
27 23 1 4 5 2545 20 2 3 4000 1 True
33 18 1 2 5 9610 8 2 3 1500 1 True
49 18 1 3 5 725 12 2 3 1400 1 True
53 25 2 4 5 3870 40 1 3 13000 1 True
80 20 1 2 5 725 18 2 3 650 1 True
... ... ... ... ... ... ... ... ... ... ... ...
195602 24 1 4 5 2300 40 2 4 20000 2 True
195628 46 1 2 1 8740 40 2 4 12000 2 True
195629 50 1 1 1 7340 50 1 4 17900 2 True
195642 42 1 1 1 6260 40 1 4 61000 2 False
195664 40 1 2 1 9600 40 1 4 30000 2 True

168541 rows × 11 columns

In [22]:
train,test=obj_df.drop(['SALARY','Salary_below_median'],axis=1), obj_df['Salary_below_median']
X_train, X_test, y_train, y_test = train_test_split(train,test, test_size=0.2, random_state=np.random.seed())
y_test=y_test.astype('int')
y_train=y_train.astype('int')

Binary Classifier (non Fair-ML)¶

In [23]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train,y_train)
dtc_pred=dtc.predict(X_test)

clf = SGDClassifier()
# fit (train) the classifier
clf.fit(X_train, y_train)
clf_pred=clf.predict(X_test)

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)

gbc = GradientBoostingClassifier(n_estimators=100)
gbc.fit(X_train, y_train)
gbc_pred = gbc.predict(X_test)

xgb = xg.XGBClassifier()
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)


clf_acc = accuracy_score(y_test, clf_pred)
print("Linear Accuracy: " + str(clf_acc))

rfc_acc = accuracy_score(y_test, rfc_pred)
print("Random Forest Accuracy: " + str(rfc_acc))

gbc_acc = accuracy_score(y_test, gbc_pred)
print("Gradient Boosting Accuracy: " + str(gbc_acc))

dtc_acc = accuracy_score(y_test, dtc_pred)
print("Decision Tree Accuracy: "+ str(dtc_acc))

xgb_acc = accuracy_score(y_test, xgb_pred)
print("XGBoost Accuracy: " + str(xgb_acc))
Linear Accuracy: 0.7598267524993325
Random Forest Accuracy: 0.8059865317867632
Gradient Boosting Accuracy: 0.8150938918389747
Decision Tree Accuracy: 0.7603310688540152
XGBoost Accuracy: 0.8237562668723486
In [24]:
imp_clf = rfpimp.importances(clf, X_test, y_test)
imp_clf
Out[24]:
Importance
Feature
WKHP 0.1038
MAR 0.0390
SCHL 0.0342
OCCP 0.0232
HISP 0.0090
RACE 0.0070
SEX 0.0036
AGE 0.0010
COW -0.0018
In [25]:
imp_rfc = rfpimp.importances(rfc, X_test, y_test)
imp_rfc
Out[25]:
Importance
Feature
WKHP 0.1268
OCCP 0.0744
AGE 0.0456
SCHL 0.0282
SEX 0.0150
HISP 0.0086
COW 0.0078
RACE 0.0062
MAR 0.0044
In [26]:
imp_gbc = rfpimp.importances(gbc, X_test, y_test)
imp_gbc
Out[26]:
Importance
Feature
WKHP 0.1260
AGE 0.0570
OCCP 0.0444
SCHL 0.0344
SEX 0.0090
MAR 0.0054
HISP 0.0044
COW 0.0028
RACE 0.0014

Fair ML Attempt¶

In [27]:
exp_lin = dx.Explainer(clf, X_test, y_test, verbose=False)
exp_rfg = dx.Explainer(rfc, X_test, y_test, verbose=False)
exp_gbc = dx.Explainer(gbc, X_test, y_test, verbose=False)
exp_dtc = dx.Explainer(dtc, X_test, y_test, verbose=False)
exp_xgb = dx.Explainer(xgb, X_test, y_test, verbose=False)
In [28]:
exp_lin.model_performance().result.append(exp_rfg.model_performance().result).append(exp_gbc.model_performance().result).append(exp_dtc.model_performance().result).append(exp_xgb.model_performance().result)
Out[28]:
recall precision f1 accuracy auc
SGDClassifier 0.660011 0.792164 0.720075 0.759827 0.753829
RandomForestClassifier 0.774102 0.803645 0.788597 0.805749 0.882478
GradientBoostingClassifier 0.765988 0.826268 0.794987 0.815094 0.894318
DecisionTreeClassifier 0.753058 0.734256 0.743538 0.756860 0.763279
XGBClassifier 0.783672 0.830244 0.806286 0.823756 0.904609

Using One-Hot Encoded Data¶

In [29]:
ACSIncome_categories = {
    "COW": {
        1.0: (
            "Employee of a private for-profit company or"
            "business, or of an individual, for wages,"
            "salary, or commissions"
        ),
        2.0: (
            "Employee of a private not-for-profit, tax-exempt,"
            "or charitable organization"
        ), 
        3.0: "Local government employee (city, county, etc.)",
        4.0: "State government employee",
        5.0: "Federal government employee", 
        6.0: (
            "Self-employed in own not incorporated business,"
            "professional practice, or farm"
        ),
        7.0: (
            "Self-employed in own incorporated business,"
            "professional practice or farm"
        ),
        8.0: "Working without pay in family business or farm",
        9.0: "Unemployed and last worked 5 years ago or earlier or never worked",
    },
    "SCHL": {
        1.0: "No schooling completed",
        2.0: "Nursery school, preschool",
        3.0: "Kindergarten",
        4.0: "Grade 1",
        5.0: "Grade 2",
        6.0: "Grade 3",
        7.0: "Grade 4",
        8.0: "Grade 5",
        9.0: "Grade 6",
        10.0: "Grade 7",
        11.0: "Grade 8",
        12.0: "Grade 9",
        13.0: "Grade 10",
        14.0: "Grade 11",
        15.0: "12th grade - no diploma",
        16.0: "Regular high school diploma",
        17.0: "GED or alternative credential",
        18.0: "Some college, but less than 1 year",
        19.0: "1 or more years of college credit, no degree",
        20.0: "Associate's degree",
        21.0: "Bachelor's degree",
        22.0: "Master's degree",
        23.0: "Professional degree beyond a bachelor's degree",
        24.0: "Doctorate degree",
    },
    "MAR": {
        1.0: "Married",
        2.0: "Widowed",
        3.0: "Divorced",
        4.0: "Separated",
        5.0: "Never married or under 15 years old",
    },
    "SEX": {1.0: "Male", 2.0: "Female"},
    "RAC1P": {
        1.0: "White alone",
        2.0: "Black or African American alone",
        3.0: "American Indian alone",
        4.0: "Alaska Native alone",
        5.0: (
            "American Indian and Alaska Native tribes specified;"
            "or American Indian or Alaska Native,"
            "not specified and no other"
        ),
        6.0: "Asian alone",
        7.0: "Native Hawaiian and Other Pacific Islander alone",
        8.0: "Some Other Race alone",
        9.0: "Two or More Races",
    },
}
In [30]:
ACSIncome = folktables.BasicProblem(
    features=[
        'AGEP',
        'COW',
        'SCHL',
        'MAR',
        'OCCP',
        'WKHP',
        'SEX',
        'RAC1P',
        'WAGP',
        'HISP'
    ],
    target='PINCP',
    target_transform=lambda x: x > 50000,    
    group='RAC1P',
    preprocess=folktables.adult_filter,
    postprocess=lambda x: np.nan_to_num(x, -1),
)
In [31]:
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
ca_data = data_source.get_data(states=["CA"], download=True)

ca_features, ca_labels, _ = ACSIncome.df_to_pandas(ca_data, categories=ACSIncome_categories, dummies=True)
ca_features
Out[31]:
AGEP OCCP WKHP WAGP HISP COW_Employee of a private for-profit company orbusiness, or of an individual, for wages,salary, or commissions COW_Employee of a private not-for-profit, tax-exempt,or charitable organization COW_Federal government employee COW_Local government employee (city, county, etc.) COW_Self-employed in own incorporated business,professional practice or farm ... SEX_Male RAC1P_Alaska Native alone RAC1P_American Indian alone RAC1P_American Indian and Alaska Native tribes specified;or American Indian or Alaska Native,not specified and no other RAC1P_Asian alone RAC1P_Black or African American alone RAC1P_Native Hawaiian and Other Pacific Islander alone RAC1P_Some Other Race alone RAC1P_Two or More Races RAC1P_White alone
0 30.0 9610.0 40.0 500.0 2.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
1 21.0 1970.0 20.0 7700.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
2 65.0 2040.0 8.0 5000.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
3 33.0 9610.0 40.0 12000.0 1.0 1.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
4 18.0 1021.0 18.0 300.0 7.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
195660 38.0 1021.0 40.0 565000.0 1.0 1.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
195661 39.0 1021.0 40.0 209000.0 1.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
195662 61.0 5240.0 45.0 105000.0 1.0 1.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
195663 69.0 2040.0 45.0 30000.0 1.0 0.0 0.0 0.0 0.0 1.0 ... 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
195664 40.0 9600.0 40.0 30000.0 2.0 1.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0

195665 rows × 53 columns

In [32]:
asian=ca_features[ca_features['RAC1P_Asian alone']==1]
black=ca_features[ca_features['RAC1P_Black or African American alone']==1]
white=ca_features[ca_features['RAC1P_White alone']==1]
other=ca_features[ca_features['RAC1P_Some Other Race alone']==1]
plottable1=pd.concat([asian, black, white, other])
plottable1['AGEP'] = plottable1['AGEP'].astype(int) #changing strings with numbers to integers
plottable1['OCCP'] = plottable1['OCCP'].astype(int)
plottable1['WKHP'] = plottable1['WKHP'].astype(int)
plottable1['WAGP'] = plottable1['WAGP'].astype(int)
plottable1['HISP'] = plottable1['HISP'].astype(int)
#plottable["MAR"].replace({"Never married or under 15 years old": "Never married"}, inplace=True)
plottable1=plottable1.rename({"WKHP":"Weekly_Hours_Worked","WAGP":"SALARY","AGEP":"AGE","OCCP":"OCCUPATION","MAR_Never married or under 15 years old": "MAR_Never married","RAC1P_Asian alone": "RACE_Asian", "RAC1P_Black or African American alone": "RACE_Black","RAC1P_White alone": "RACE_White", "RAC1P_Some Other Race alone": "RACE_Other"}, axis=1)
plottable1=plottable1[plottable1['SALARY']<350000]
plottable1=plottable1.drop(['RAC1P_Two or More Races','RAC1P_Native Hawaiian and Other Pacific Islander alone','RAC1P_Alaska Native alone','RAC1P_American Indian alone','RAC1P_American Indian and Alaska Native tribes specified;or American Indian or Alaska Native,not specified and no other'],axis=1)
plottable1['SCHL_Some High School or below']=plottable1['SCHL_12th grade - no diploma']+plottable1['SCHL_Grade 11']+plottable1['SCHL_Grade 10']+plottable1['SCHL_Grade 9']+plottable1['SCHL_Grade 8']+plottable1['SCHL_Grade 7']+plottable1['SCHL_Grade 6']+plottable1['SCHL_Grade 5']+plottable1['SCHL_Grade 4']+plottable1['SCHL_Kindergarten']+plottable1['SCHL_Grade 3']+plottable1['SCHL_Grade 2']+plottable1['SCHL_Grade 1']+plottable1['SCHL_No schooling completed']+plottable1['SCHL_Nursery school, preschool']
plottable1=plottable1.drop(['SCHL_12th grade - no diploma','SCHL_Grade 11','SCHL_Grade 10','SCHL_Grade 9','SCHL_Grade 8','SCHL_Grade 7','SCHL_Grade 6','SCHL_Grade 5','SCHL_Grade 4','SCHL_Grade 3','SCHL_Grade 2','SCHL_Grade 1','SCHL_No schooling completed','SCHL_Nursery school, preschool','SCHL_Kindergarten'],axis=1)
plottable1['SCHL_High School Diploma or Equivalent']=plottable1['SCHL_Regular high school diploma']+plottable1['SCHL_GED or alternative credential']
plottable1=plottable1.drop(['SCHL_Regular high school diploma','SCHL_GED or alternative credential'],axis=1)
plottable1['SCHL_Some College']=plottable1['SCHL_1 or more years of college credit, no degree']+plottable1['SCHL_Associate\'s degree']+plottable1['SCHL_Some college, but less than 1 year']
plottable1=plottable1.drop(['SCHL_1 or more years of college credit, no degree','SCHL_Associate\'s degree','SCHL_Some college, but less than 1 year'],axis=1)
plottable1['SCHL_Bachelors degree']=plottable1['SCHL_Bachelor\'s degree']
plottable1['SCHL_Graduate degree']=plottable1['SCHL_Professional degree beyond a bachelor\'s degree']+plottable1['SCHL_Master\'s degree']+plottable1['SCHL_Doctorate degree']
plottable1=plottable1.drop(['SCHL_Bachelor\'s degree','SCHL_Professional degree beyond a bachelor\'s degree','SCHL_Master\'s degree','SCHL_Doctorate degree'],axis=1)
plottable1['COW_Private Employee']=plottable1['COW_Employee of a private not-for-profit, tax-exempt,or charitable organization']+plottable1[ 'COW_Employee of a private for-profit company orbusiness, or of an individual, for wages,salary, or commissions']
#plottable=plottable.drop([ 'COW_Employee of a private for-profit company orbusiness, or of an individual, for wages,salary, or commissions','COW_Employee of a private not-for-profit, tax-exempt,or charitable organization'],axis=1)
plottable1['COW_Government Employee']=plottable1['COW_Federal government employee']+plottable1['COW_Local government employee (city, county, etc.)']+plottable1['COW_State government employee']
plottable1['COW_Self_Employed']=plottable1['COW_Working without pay in family business or farm']+plottable1['COW_Self-employed in own not incorporated business,professional practice, or farm']+plottable1['COW_Self-employed in own not incorporated business,professional practice, or farm']
plottable1=plottable1.drop(['COW_Working without pay in family business or farm','COW_Employee of a private for-profit company orbusiness, or of an individual, for wages,salary, or commissions','COW_Employee of a private not-for-profit, tax-exempt,or charitable organization','COW_Federal government employee','COW_Local government employee (city, county, etc.)','COW_Self-employed in own incorporated business,professional practice or farm', 'COW_Self-employed in own not incorporated business,professional practice, or farm', 'COW_State government employee'],axis=1)
for i in range(3,25):
    plottable1['HISP']=plottable1['HISP'].replace(i,2,regex=True)
plottable1
Out[32]:
AGE OCCUPATION Weekly_Hours_Worked SALARY HISP MAR_Divorced MAR_Married MAR_Never married MAR_Separated MAR_Widowed ... RACE_Other RACE_White SCHL_Some High School or below SCHL_High School Diploma or Equivalent SCHL_Some College SCHL_Bachelors degree SCHL_Graduate degree COW_Private Employee COW_Government Employee COW_Self_Employed
27 23 2545 20 4000 1 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
33 18 9610 8 1500 1 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0
49 18 725 12 1400 1 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0
53 25 3870 40 13000 1 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
80 20 725 18 650 1 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
195602 24 2300 40 20000 2 0.0 0.0 1.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
195628 46 8740 40 12000 2 0.0 1.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0
195629 50 7340 50 17900 2 0.0 1.0 0.0 0.0 0.0 ... 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
195642 42 6260 40 61000 2 0.0 1.0 0.0 0.0 0.0 ... 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
195664 40 9600 40 30000 2 0.0 1.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0

182867 rows × 24 columns

In [33]:
salary_fit = pd.cut(plottable1.SALARY, bins = [0,plottable1['SALARY'].median(),350001], labels= [True, False])
plottable1.insert(10, 'Salary_below_median', salary_fit)
plottable1
Out[33]:
AGE OCCUPATION Weekly_Hours_Worked SALARY HISP MAR_Divorced MAR_Married MAR_Never married MAR_Separated MAR_Widowed ... RACE_Other RACE_White SCHL_Some High School or below SCHL_High School Diploma or Equivalent SCHL_Some College SCHL_Bachelors degree SCHL_Graduate degree COW_Private Employee COW_Government Employee COW_Self_Employed
27 23 2545 20 4000 1 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
33 18 9610 8 1500 1 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0
49 18 725 12 1400 1 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0
53 25 3870 40 13000 1 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
80 20 725 18 650 1 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
195602 24 2300 40 20000 2 0.0 0.0 1.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
195628 46 8740 40 12000 2 0.0 1.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0
195629 50 7340 50 17900 2 0.0 1.0 0.0 0.0 0.0 ... 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
195642 42 6260 40 61000 2 0.0 1.0 0.0 0.0 0.0 ... 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
195664 40 9600 40 30000 2 0.0 1.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0

182867 rows × 25 columns

In [34]:
plottable1 = plottable1[plottable1['Salary_below_median'].notna()]
In [35]:
train1,test1=plottable1.drop(['SALARY','Salary_below_median'],axis=1), plottable1['Salary_below_median']
#X_train, X_test, y_train, y_test = train_test_split(train1,test1, test_size=0.2, random_state=np.random.seed())
X_train1, X_test1, y_train1, y_test1 = train_test_split(train1, test1, test_size=0.2, random_state=np.random.seed())
y_train1=y_train1.astype(int)
y_test1=y_test1.astype(int)

dtc_one = DecisionTreeClassifier()
dtc_one.fit(X_train1,y_train1)
dtc_one_pred=dtc_one.predict(X_test1)

clf_one = SGDClassifier()
clf_one.fit(X_train1, y_train1)
clf_one_pred=clf_one.predict(X_test1)

rfc_one = RandomForestClassifier(n_estimators=100)
rfc_one.fit(X_train1, y_train1)
rfc_one_pred = rfc_one.predict(X_test1)

gbc_one = GradientBoostingClassifier(n_estimators=100)
gbc_one.fit(X_train1, y_train1)
gbc_one_pred = gbc_one.predict(X_test1)

xgb_one = xg.XGBClassifier()
xgb_one.fit(X_train1, y_train1)
xgb_one_pred = xgb_one.predict(X_test1)

clf_one_acc = accuracy_score(y_test1, clf_one_pred)
print("Linear Accuracy: " + str(clf_one_acc))

rfc_one_acc = accuracy_score(y_test1, rfc_one_pred)
print("Random Forest Accuracy: " + str(rfc_one_acc))

gbc_one_acc = accuracy_score(y_test1, gbc_one_pred)
print("Gradient Boosting Accuracy: " + str(gbc_one_acc))

dtc_one_acc = accuracy_score(y_test1, dtc_one_pred)
print("Decision Tree Accuracy: " + str(dtc_one_acc))

xgb_one_acc = accuracy_score(y_test1, xgb_one_pred)
print("XGBoost Accuracy: " + str(xgb_one_acc))
Linear Accuracy: 0.6527930226349046
Random Forest Accuracy: 0.8029013023228218
Gradient Boosting Accuracy: 0.8155982081936575
Decision Tree Accuracy: 0.7627933192915839
XGBoost Accuracy: 0.827998457385268
In [36]:
exp_clf1 = dx.Explainer(clf_one, X_test1, y_test1, verbose=False)
exp_rfc1 = dx.Explainer(rfc_one, X_test1, y_test1, verbose=False)
exp_gbc1 = dx.Explainer(gbc_one, X_test1, y_test1, verbose=False)
exp_dtc1 = dx.Explainer(dtc_one, X_test1, y_test1, verbose=False)
exp_xgb1 = dx.Explainer(xgb_one, X_test1, y_test1, verbose=False)
In [37]:
exp_clf1.model_performance().result.append(exp_rfc1.model_performance().result).append(exp_gbc1.model_performance().result).append(exp_dtc1.model_performance().result).append(exp_xgb1.model_performance().result)
Out[37]:
recall precision f1 accuracy auc
SGDClassifier 0.939961 0.579525 0.716994 0.652793 0.670109
RandomForestClassifier 0.774361 0.797675 0.785845 0.802516 0.878851
GradientBoostingClassifier 0.766310 0.826914 0.795459 0.815598 0.895566
DecisionTreeClassifier 0.752045 0.741236 0.746601 0.761132 0.766918
XGBClassifier 0.787675 0.835339 0.810807 0.827998 0.907197

Fair ML¶

In [38]:
plottable = plottable[plottable['Salary_below_median'].notna()]
X = plottable.drop(columns=['SALARY','Salary_below_median'], axis=1)
y = plottable.Salary_below_median

X_train_fair,X_test_fair,y_train_fair,y_test_fair,race_train,race_test=train_test_split(X, y,X['RACE'], test_size=0.2, random_state=np.random.seed())

categorical_features = ['SEX', 'COW', 'MAR', 'RACE', "SCHL"]
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_features=['AGE', 'OCCP','WKHP','HISP']
numerical_transformer = Pipeline(steps=[
    ('scale', StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    
])

classifier = RandomForestClassifier(max_depth=10,n_estimators=20,random_state=np.random.seed())

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier)
])
y_train_fair=y_train_fair.astype('int')
y_test_fair=y_test_fair.astype('int')
clf.fit(X_train_fair, y_train_fair)
unaware_y_preds=clf.predict(X_test_fair)
In [39]:
exp = dx.Explainer(clf, X_test_fair, y_test_fair, label='Random Forest Bias Unaware', verbose=True)
exp.model_performance()
Preparation of a new explainer is initiated

  -> data              : 33709 rows 9 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 33709 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : Random Forest Bias Unaware
  -> predict function  : <function yhat_proba_default at 0x00000293D51C3370> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0369, mean = 0.472, max = 0.995
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.98, mean = 0.00326, max = 0.957
  -> model_info        : package sklearn

A new explainer has been created!
Out[39]:
recall precision f1 accuracy auc
Random Forest Bias Unaware 0.766958 0.819584 0.792398 0.809131 0.889062
In [40]:
exp.model_parts().plot()
In [41]:
fair_tree=exp.model_fairness(protected=race_test, privileged="White")
fair_tree.metric_scores
Out[41]:
TPR TNR PPV NPV FNR FPR FDR FOR ACC STP
Asian 0.674 0.905 0.826 0.806 0.326 0.095 0.174 0.194 0.812 0.327
Black 0.733 0.840 0.837 0.736 0.267 0.160 0.163 0.264 0.783 0.464
Other 0.875 0.595 0.798 0.723 0.125 0.405 0.202 0.277 0.776 0.709
White 0.763 0.862 0.824 0.811 0.237 0.138 0.176 0.189 0.816 0.425
In [42]:
y_test_fair.groupby(race_test).mean()
Out[42]:
RACE
Asian    0.400426
Black    0.529595
Other    0.646511
White    0.458866
Name: Salary_below_median, dtype: float64
In [43]:
pd.Series(unaware_y_preds,index=y_test_fair.index).groupby(race_test).mean()
Out[43]:
RACE
Asian    0.326996
Black    0.463551
Other    0.709130
White    0.425134
dtype: float64
In [44]:
fair_tree.fairness_check()
Bias detected in 2 metrics: FPR, STP

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on 'White'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
            TPR       ACC       PPV       FPR       STP
Asian  0.883355  0.995098  1.002427  0.688406  0.769412
Black  0.960682  0.959559  1.015777  1.159420  1.091765
Other  1.146789  0.950980  0.968447  2.934783  1.668235
In [45]:
fair_tree.plot()
In [46]:
from sklearn.preprocessing import PowerTransformer
from sklearn.base import BaseEstimator, TransformerMixin
In [47]:
class NormalizeColumnByLabel(BaseEstimator, TransformerMixin):
    def __init__(self,col,label):
        self.col=col
        self.label=label
        self.transformers = {}
        
    def fit(self,X,y=None):
        for group in X[self.label].unique():
            self.transformers[group] = PowerTransformer( method='yeo-johnson',standardize=True)
            self.transformers[group].fit(
                X.loc[X[self.label]==group][self.col].values.reshape(-1,1))
        return self
    
    def transform(self,X,y=None):
        C=X.copy()
        for group in X[self.label].unique():
            C.loc[X[self.label]==group, self.col] = self.transformers[group].transform(X.loc[X[self.label]==group][self.col].values.reshape(-1,1))
        return C
In [48]:
n=NormalizeColumnByLabel(col='WKHP',label='RACE')

X_train_norm=n.fit_transform(X_train_fair, y_train_fair)

X_train_norm.groupby('RACE')['WKHP'].describe()
Out[48]:
count mean std min 25% 50% 75% max
RACE
Asian 24373.0 -1.273980e-16 1.000021 -2.780694 -0.191703 0.144517 0.144517 5.544391
Black 6447.0 -3.526814e-16 1.000078 -2.814527 -0.433156 0.179557 0.179557 4.689407
Other 16964.0 -9.005346e-18 1.000029 -3.086510 -0.267739 0.181157 0.181157 5.764106
White 87048.0 1.935365e-16 1.000006 -2.728541 -0.284411 0.109967 0.109967 5.059046
In [49]:
clf_aware = Pipeline(steps=[
    ('normalize_priors', NormalizeColumnByLabel(col='WKHP',label='RACE')),
    ('preprocessor',preprocessor),
    ('classifier',classifier)
])

clf_aware.fit(X_train_fair,y_train_fair)
aware_y_preds = clf_aware.predict(X_test_fair)
In [50]:
exp_aware = dx.Explainer(clf_aware, X_test_fair,y_test_fair, label='Random Forest DIR', verbose=False)
mf_aware=exp_aware.model_fairness(protected=race_test, privileged="White")
In [51]:
pd.concat([exp1.model_performance().result for exp1 in [exp,exp_aware]])
Out[51]:
recall precision f1 accuracy auc
Random Forest Bias Unaware 0.766958 0.819584 0.792398 0.809131 0.889062
Random Forest DIR 0.760899 0.821277 0.789936 0.807796 0.889296
In [52]:
fair_tree.plot(objects=[mf_aware],type='stacked')
In [53]:
from aif360.algorithms.preprocessing.lfr import LFR
from aif360.datasets import BinaryLabelDataset

class LFRCustom(BaseEstimator, TransformerMixin):
    def __init__(self,col,protected_col,unprivileged_groups,privileged_groups):
        self.col = col
        self.protected_col= protected_col
        self.TR = None
        self.unprivileged_groups = unprivileged_groups
        self.privileged_grous = privileged_groups
        
    def fit(self,X,y=None):
        d = pd.DataFrame(X, columns = self.col)
        d['response'] = list(y)
        
        binary_df = BinaryLabelDataset(
            df=d,
            protected_attribute_names = self.protected_col,
            label_names=['response']
        )
        
        self.TR = LFR(unprivileged_groups=self.unprivileged_groups,
                     privileged_groups = self.privileged_groups, seed=0,
                     k=2, Ax=0.5, Az=0.2,
                     verbose=1)
        
        self.TR.fit(binary_df,maxiter=5000,maxfun=5000)
        return self
    
    def transform(self,X, y=None):
        d = pd.DataFrame(X, columns=self.col)
        if y:
            d['response'] = list(y)
        else:
            d['response'] = False
            
        binary_df=BinaryLabelDataset(
            df=d,
            protected_attribute_names = self.protected_col,
            label_names=['response']
        )
        return self.TR.transform(binary_df).convert_to_dataframe()[0].drop(['response'], axis=1)
In [54]:
plottable = plottable[plottable['Salary_below_median'].notna()]
X = plottable.drop(columns=['SALARY','Salary_below_median'], axis=1)
y = plottable.Salary_below_median

categorical_features = ['SEX', 'COW', 'MAR', 'RACE', "SCHL"]
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
        ('cat', categorical_transformer, categorical_features)
])

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(max_depth=7, random_state=123))
])
y=y.astype('int')
clf.fit(X, y)

exp = dx.Explainer(clf, X, y)
Preparation of a new explainer is initiated

  -> data              : 168541 rows 9 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 168541 values
  -> model_class       : sklearn.tree._classes.DecisionTreeClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x00000293D51C3370> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.47, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.957, mean = -5.48e-18, max = 0.875
  -> model_info        : package sklearn

A new explainer has been created!
In [55]:
protected = plottable.SEX + '_' + np.where(plottable.AGE < 35, 'young', 'old')
privileged = 'Male_old'

fobject = exp.model_fairness(protected = protected, privileged=privileged)
In [56]:
fobject.fairness_check(epsilon = 0.8)
Bias detected in 4 metrics: TPR, PPV, FPR, STP

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on 'Male_old'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
                   TPR       ACC       PPV       FPR       STP
Female_old    1.178330  0.925414  1.210909  1.352564  1.446281
Female_young  1.629797  1.020718  1.585455  1.442308  2.326446
Male_young    1.708804  1.029006  1.489091  1.762821  2.376033
In [57]:
protected1 = plottable.RACE + '_' + np.where(plottable.SEX == 'Male', 'Male', 'Female')
privileged1 = 'White_Male'

fobject = exp.model_fairness(protected = protected1, privileged=privileged1)
In [58]:
protected2 = plottable.RACE
privileged2 = 'White'

fobject = exp.model_fairness(protected = protected2, privileged=privileged2)
In [59]:
fobject.fairness_check(epsilon = 0.8)
Bias detected in 3 metrics: TPR, FPR, STP

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on 'White'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
            TPR       ACC       PPV       FPR       STP
Asian  0.885965  1.021067  0.994543  0.736842  0.792023
Black  1.136842  0.950843  0.969986  1.695906  1.361823
Other  1.489474  1.008427  1.017735  3.035088  2.082621
In [60]:
# https://medium.com/responsibleml/how-to-easily-check-if-your-ml-model-is-fair-2c173419ae4c
# https://www.kdnuggets.com/2020/12/machine-learning-model-fair.html
# https://freecontent.manning.com/bias-and-fairness-in-machine-learning-part-3-building-a-bias-aware-model/
# https://freecontent.manning.com/bias-and-fairness-in-machine-learning-part-2-building-a-baseline-model-and-features/
In [61]:
train1,test1=plottable1.drop(['SALARY','Salary_below_median'],axis=1), plottable1['Salary_below_median']
test1=test1.astype('int')
X_train1, X_test1, y_train1, y_test1 = train_test_split(train1, test1, test_size=0.2, random_state=np.random.seed())


clf_1 = SGDClassifier()
# fit (train) the classifier
clf_1.fit(X_train1, y_train1)
clf_pred=clf_1.predict(X_test1)


rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train1, y_train1)
rfc_pred = rfc.predict(X_test1)

gbc = GradientBoostingClassifier(n_estimators=100)
gbc.fit(X_train1, y_train1)
gbc_pred = gbc.predict(X_test1)

clf_acc = accuracy_score(y_test1, clf_pred)
print("Linear Accuracy: " + str(clf_acc))

rfc_acc = accuracy_score(y_test1, rfc_pred)
print("Random Forest Accuracy: " + str(rfc_acc))

gbc_acc = accuracy_score(y_test1, gbc_pred)
print("Gradient Boosting Accuracy: " + str(gbc_acc))
Linear Accuracy: 0.5551039781660684
Random Forest Accuracy: 0.7976208134326145
Gradient Boosting Accuracy: 0.8128393010768638
In [62]:
X,y=plottable.drop(['SALARY','Salary_below_median'],axis=1), plottable['Salary_below_median']
y=y.astype('int')
numeric_features = ['AGE', 'OCCP', 'WKHP','HISP']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

categorical_features = ['SEX', 'COW', 'MAR', 'RACE', "SCHL"]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features)])

sgd_class = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', SGDClassifier(random_state=np.random.seed()))]).fit(X,y)


clf_forest = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(random_state=np.random.seed(), max_depth=4))]).fit(X,y)

clf_gboost = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', GradientBoostingClassifier(random_state=np.random.seed()))]).fit(X,y)

clf_xgboost = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', xg.XGBClassifier(random_state=np.random.seed()))]).fit(X,y)


 # create Explainer objects 
exp_forest = dx.Explainer(clf_forest, X,y, verbose = False)
exp_xgboost = dx.Explainer(clf_xgboost, X,y, verbose = False)
exp_sgd = dx.Explainer(sgd_class, X,y, verbose=False)
exp_gboost = dx.Explainer(clf_gboost, X,y, verbose = False)

# create fairness explanations
fobject_forest = exp_forest.model_fairness(protected2, privileged2)
fobject_xgboost = exp_xgboost.model_fairness(protected2, privileged2)
fobject_sgd = exp_sgd.model_fairness(protected2, privileged2)
fobject_gboost = exp_gboost.model_fairness(protected2, privileged2)

# lets see their metric scores
fobject.plot(objects=[fobject_xgboost, fobject_forest, fobject_gboost,fobject_sgd]) #, type = "metric_scores")
In [63]:
fobject.plot(objects=[fobject_forest, fobject_xgboost,fobject_sgd], type = "metric_scores")
In [64]:
protected3 = plottable.SEX + '_' + np.where(plottable.HISP == 1, 'Non-Hispanic', 'Hispanic')
privileged3 = 'Male_Non-Hispanic'

fobject3 = exp.model_fairness(protected = protected3, privileged=privileged3)

# create fairness explanations
fobject_forest3 = exp_forest.model_fairness(protected3, privileged3)
fobject_xgboost3 = exp_xgboost.model_fairness(protected3, privileged3)
fobject_sgd3 = exp_sgd.model_fairness(protected3, privileged3)
fobject_gboost3 = exp_gboost.model_fairness(protected3, privileged3)


# lets see their metric scores
fobject3.plot(objects=[fobject_gboost3,fobject_forest3, fobject_xgboost3,fobject_sgd3]) #, type = "metric_scores")
In [65]:
protected4 = np.where(plottable.HISP == 1, 'Non-Hispanic', 'Hispanic')
privileged4 = 'Non-Hispanic'

fobject4 = exp.model_fairness(protected = protected4, privileged=privileged4)

# create fairness explanations
fobject_forest4 = exp_forest.model_fairness(protected4, privileged4)
fobject_xgboost4 = exp_xgboost.model_fairness(protected4, privileged4)
fobject_sgd4 = exp_sgd.model_fairness(protected4, privileged4)
fobject_gboost4 = exp_gboost.model_fairness(protected4, privileged4)

# lets see their metric scores
fobject4.plot(objects=[fobject_gboost4,fobject_forest4, fobject_xgboost4,fobject_sgd4])
In [66]:
protected5 = plottable.SEX
privileged5 = 'Male'

fobject5 = exp.model_fairness(protected = protected5, privileged=privileged5)

# create fairness explanations
fobject_forest5 = exp_forest.model_fairness(protected5, privileged5)
fobject_xgboost5 = exp_xgboost.model_fairness(protected5, privileged5)
fobject_sgd5 = exp_sgd.model_fairness(protected5, privileged5)
fobject_gboost5 = exp_gboost.model_fairness(protected5, privileged5)

# lets see their metric scores
fobject5.plot(objects=[fobject_gboost5,fobject_forest5,fobject_xgboost5,fobject_sgd5])
In [ ]: